source("../rscripts/package.R")

1 Background, observations, questions

Each GF binds to a receptor and triggers a cascade that includes ERK plus others. But it is important for the cell to respond adequately to the signal, e.g. proliferation vs differentiation. Each GF induces a different type of response when applied in a sustained fashion:

Clustering gives us ways to see these different behaviours, but we would like a way to put numbers on these differences in behaviour, maybe eventually to automatically detect them.

Data look like:

# Remove unnecessary columns for clarity
head(Yanni)
##    Condition   Label    Ratio RealTime Metadata_Series Metadata_T
## 1:    E-0.25 12_0002 1183.396        0              12          0
## 2:    E-0.25 12_0002 1187.431        2              12          1
## 3:    E-0.25 12_0002 1172.491        4              12          2
## 4:    E-0.25 12_0002 1176.407        6              12          3
## 5:    E-0.25 12_0002 1172.421        8              12          4
## 6:    E-0.25 12_0002 1167.917       10              12          5
##    TrackObjects_Label         Stim_All_Ch               Stim_All_S
## 1:                  2 EGF 0.25 ng/ml sust S12: EGF 0.25 ng/ml sust
## 2:                  2 EGF 0.25 ng/ml sust S12: EGF 0.25 ng/ml sust
## 3:                  2 EGF 0.25 ng/ml sust S12: EGF 0.25 ng/ml sust
## 4:                  2 EGF 0.25 ng/ml sust S12: EGF 0.25 ng/ml sust
## 5:                  2 EGF 0.25 ng/ml sust S12: EGF 0.25 ng/ml sust
## 6:                  2 EGF 0.25 ng/ml sust S12: EGF 0.25 ng/ml sust
del.cols <- names(Yanni)[5:9]
Yanni[, (del.cols) := NULL]
Yanni
##         Condition   Label    Ratio RealTime
##      1:    E-0.25 12_0002 1183.396        0
##      2:    E-0.25 12_0002 1187.431        2
##      3:    E-0.25 12_0002 1172.491        4
##      4:    E-0.25 12_0002 1176.407        6
##      5:    E-0.25 12_0002 1172.421        8
##     ---                                    
## 195640:     N-250 14_0051 1278.354      352
## 195641:     N-250 14_0051 1276.160      354
## 195642:     N-250 14_0051 1282.459      356
## 195643:     N-250 14_0051 1280.433      358
## 195644:     N-250 14_0051 1282.518      360
ggplot(Yanni, aes(y=Ratio, x=RealTime)) + geom_line(aes(group=Label)) + facet_wrap(~Condition) + scale_y_continuous(limits = c(1000, 1750)) + scale_x_continuous(limits = c(0,200)) + stat_summary(fun.y=mean, geom="line", colour = "blue", size = 1.5) + theme(text = element_text(size = 25))

2 Clustering quality

Idea: Hclust, Kmeans… Use the clustering quality criteria (Davies-Bouldin, Dunn…) as a measure of distance between the clusters.

Here Kmeans/Kmedoids seem appropriate, we guess that the number of clusters should be no more than let’s say 10.

2.1 Data normalization

Prior to clustering, normalize the data since the computation of distances between TS always relies on some kind of Euclidean distance. We use the fold-change “Korean way”, which normalizes in a per-trajectory fashion.

  • Time range used for normalization: 0-36
  • Trim x-axis (time): 0-200
# Normalization: per-trajectory fold change against the 0-36 min window
# (adds a Ratio.norm column). Baseline semantics are defined by the project
# helper myNorm in package.R — presumably value / mean(baseline), confirm.
Yanni <- myNorm(in.dt = Yanni, in.meas.col = "Ratio", in.rt.min = 0, in.rt.max = 36, in.by.cols = c("Condition", "Label"), in.type = "fold.change")
# X trimming
Yanni <- Yanni[RealTime <= 200]

2.2 Smoothing

Erase small variations in the signal that are not relevant for distance computation, smooth out high frequency variations (noise).

# Per-trajectory rolling smoother with window k = 5 (rollex is a project
# helper from package.R — presumably a rolling mean, confirm there)
Yanni[, Ratio.norm.smooth := rollex(Ratio.norm, k = 5), by = c("Condition", "Label")]
Yanni
##         Condition   Label    Ratio RealTime Ratio.norm Ratio.norm.smooth
##      1:    E-0.25 12_0002 1183.396        0  1.0072021         1.0056096
##      2:    E-0.25 12_0002 1187.431        2  1.0106363         1.0042921
##      3:    E-0.25 12_0002 1172.491        4  0.9979203         1.0029746
##      4:    E-0.25 12_0002 1176.407        6  1.0012535         1.0003396
##      5:    E-0.25 12_0002 1172.421        8  0.9978608         0.9995891
##     ---                                                                 
## 117487:     N-250 14_0051 1381.819      192  1.1640162         1.1499901
## 117488:     N-250 14_0051 1366.671      194  1.1512561         1.1506136
## 117489:     N-250 14_0051 1323.189      196  1.1146276         1.1313087
## 117490:     N-250 14_0051 1353.097      198  1.1398211         1.1216562
## 117491:     N-250 14_0051 1290.181      200  1.0868222         1.1120037
# Plot the first trajectory at each processing stage (raw, normalized,
# normalized + smoothed), stacked in a 3-row panel for visual comparison
par(mfrow=c(3,1))
plot(Yanni[Condition=="E-0.25" & Label=="12_0002", Ratio], type = "b", ylab = "value", cex.main=3, cex.axis=2.5, main = "Raw")
plot(Yanni[Condition=="E-0.25" & Label=="12_0002", Ratio.norm], type = "b", ylab = "value", cex.main=3, cex.axis=2.5, main = "Normalized")
plot(Yanni[Condition=="E-0.25" & Label=="12_0002", Ratio.norm.smooth], type = "b", ylab = "value", cex.main=3, cex.axis=2.5, main = "Normalized and smoothed")

2.3 Clustering indices

Will use a method based on DTW, much more tailored to TS than classical Euclidean distance. In addition we use partitioning around medoids instead of Kmeans; these are basically the same methods except that medoids are much less sensitive to outliers. Instead of picking the mean of a cluster as a center, medoids pick the data point that is the closest to the middle of the cluster.

Though the method is dependent on the initialization, changing the seed doesn’t affect the clustering that much.

CastCluster <- function(data, time.col, condition.col, label.col, measure.col,
                        k.clust, na.fill, plot = TRUE, return.quality = TRUE, ...){
  # Cast the long table to a wide matrix, run partitional DTW clustering for
  # each number of clusters in k.clust, and compute internal quality indices.
  #
  # Args:
  #   data: long-format data.table, one row per (trajectory, time point).
  #   time.col, condition.col, label.col, measure.col: column names (strings).
  #   k.clust: integer vector of cluster counts to try (e.g. 2L:8L).
  #     NOTE(review): a single k makes tsclust return one object instead of a
  #     list, which would break the loop below — confirm callers always pass
  #     a vector.
  #   na.fill: value replacing NAs introduced by the cast.
  #   plot: if TRUE, bar-plot the quality indices per number of clusters.
  #   return.quality: if TRUE, include the quality matrix in the output.
  #   ...: forwarded to dtwclust::tsclust().
  #
  # Returns: list with the clustering objects, the casted table with one
  #   cluster-assignment column prepended per k, and (optionally) the
  #   quality-index matrix.
  library(dtwclust)
  temp <- myCast(data, time.col, condition.col, label.col, measure.col, na.fill)

  # Partitioning around medoids (PAM) with basic DTW distance; fixed seed so
  # the initialization-dependent result is reproducible
  clust <- tsclust(temp$casted.matrix, type = "partitional", k = k.clust,
                   distance = "dtw_basic", centroid = "pam", seed = 42, ...)
  names(clust) <- paste0("k_", k.clust)
  quality <- sapply(clust, cvi, type = "internal")

  # Prepend one cluster-assignment column per k to the casted table
  cluster.table <- temp$casted
  for(k in seq_along(clust)){
    cluster.table <- cbind(clust[[k]]@cluster, cluster.table)
    colnames(cluster.table)[1] <- names(clust)[k]
  }

  # One bar panel per quality index, across the tried numbers of clusters
  if(plot){
    library(ggplot2)
    mquality <- melt(quality)
    names(mquality) <- c("Stat", "Nb.Clust", "value")
    plot(ggplot(mquality, aes(x=Nb.Clust, y = value)) +
           geom_col(aes(group = Nb.Clust, fill = Nb.Clust)) +
           facet_grid(Stat ~ ., scales = "free_y"))
  }

  # Output
  if(return.quality) return(list(cluster = clust, table = cluster.table, quality = quality))
  else return(list(out = clust, table = cluster.table))
}


myCast <- function(data, time.col, condition.col, label.col, measure.col, na.fill){
  # Cast a long table to wide format (one row per trajectory, one column per
  # time point) and return both the casted table and its numeric matrix.
  # dcast can change the order of the rows depending on the order in which
  # the keyed columns are passed; the casted table is kept alongside the
  # matrix so row identities can be recovered afterwards.
  wide <- dcast(copy(data),
                get(condition.col) + get(label.col) ~ get(time.col),
                value.var = measure.col)
  # Drop the two leading label columns, then replace NAs by the fill value
  mat <- as.matrix(wide[, c(-1, -2)])
  mat[which(is.na(mat))] <- na.fill
  return(list(casted = wide, casted.matrix = mat))
}


plot_cluster <- function(data, id.vars.col, cluster.col, type,
                         label.col = "Label", condition.col = "Condition"){
  # Plot clusters directly from output$table of CastCluster.
  #
  # Args:
  #   data: casted table with cluster-assignment columns (k_2, k_3, ...).
  #   id.vars.col: column indices of the id variables (include ALL
  #     clustering columns plus the condition/label columns).
  #   cluster.col: name of the cluster-assignment column to plot.
  #   type: "trajectory" for per-cluster time courses with a mean overlay,
  #     "composition" for per-cluster condition counts.
  #   label.col, condition.col: names of the trajectory-id and condition
  #     columns. New, defaulted parameters: previously these names were
  #     bare symbols inside aes(), resolved in the calling environment —
  #     `label.col` was undefined there (error) and `condition.col` would
  #     map a constant string rather than the column.
  library(ggplot2)
  ids <- colnames(data)[id.vars.col]
  melted <- melt(data, id.vars = ids)
  if(type=="trajectory"){
    # One line per trajectory, one facet per cluster, blue mean trace on top
    ggplot(melted, aes(x = as.numeric(variable), y = value)) +
      geom_line(aes_string(group = label.col)) +
      facet_wrap(as.formula(paste("~", cluster.col))) +
      stat_summary(fun.y = mean, geom = "line", colour = "blue", size = 1.5) +
      xlab("Time")
  } else if(type=="composition"){
    # Cluster id as factor so bars are discrete
    melted[, c(cluster.col):=as.factor(get(cluster.col))]
    ggplot(melted, aes_string(cluster.col)) + geom_bar(aes_string(fill = condition.col))
  }
}
# Split per growth factor (E = EGF, N = NGF, F = FGF), four concentrations each
EGF <- Yanni[Condition %in% c("E-0.25", "E-2.5", "E-25", "E-250")]
NGF <- Yanni[Condition %in% c("N-0.25", "N-2.5", "N-25", "N-250")]
FGF <- Yanni[Condition %in% c("F-0.25", "F-2.5", "F-25", "F-250")]

# Shared clustering parameters
cond <- "Condition"  # condition column name
lab <-  "Label"      # trajectory-id column name
tim <- "RealTime"    # time column name
k.clus <- 2L:8L      # numbers of clusters to evaluate
na.f.raw <- 1200     # NA fill on the raw Ratio scale (~1000-1750)
na.f.nor <- 1.1      # NA fill on the fold-change scale (~1)

Clustering quality is evaluated with the following indices (see ?dtwclust::cvi ; https://pdfs.semanticscholar.org/a522/fb4646ad19fe88893b90e6fbc1faa1470976.pdf):

  • CH: Calinski-Harabasz index, to MAXIMIZE
  • COP: COP index, to MINIMIZE
  • D: Dunn index, to MAXIMIZE
  • DB: Davies-Bouldin, to MINIMIZE
  • DBstar: modified Davies-Bouldin, to MINIMIZE
  • SF: Score Function, to MAXIMIZE
  • Sil: Silhouette index, to MAXIMIZE

Results with these indices are very often unclear and contradictory; inspecting the cluster composition can help. In a setting where one concentration gives rise to a unique response, we would expect a good clustering to contain clusters with at least 25% of the observations (since there are 4 concentrations) that are enriched/depleted in at least one concentration.

2.3.1 EGF

# Cluster EGF on three versions of the signal; the NA fill matches the scale
# of the clustered measure (raw ~1200 vs fold change ~1.1)
# With unnormalized and unsmoothed data
clust_EGF_1 <- CastCluster(EGF, time.col = tim, condition.col = cond,  label.col = lab, k.clust = k.clus, na.fill = na.f.raw, plot = F, measure.col = "Ratio")
# With normalized and unsmoothed data
clust_EGF_2 <- CastCluster(EGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.nor, plot = F, measure.col = "Ratio.norm")
# With normalized and smoothed data
clust_EGF_3 <- CastCluster(EGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.nor, plot = F, measure.col = "Ratio.norm.smooth")

Advised number of clusters:

# Advised k per index: which.max/which.min return the position within
# k.clus = 2:8, so +1 converts the position to the actual number of clusters
raw <- c(apply(clust_EGF_1$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_EGF_1$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
norm <- c(apply(clust_EGF_2$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_EGF_2$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
smooth <- c(apply(clust_EGF_3$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_EGF_3$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
rbind(raw, norm, smooth)
##        CH D SF Sil COP DB DBstar
## raw     3 8  2   2   8  3      3
## norm    2 5  2   2   7  2      2
## smooth  2 6  2   2   6  2      2

plot_cluster(clust_EGF_3$table, id.vars.col = 1:9, cluster.col = "k_2", type = "trajectory")
## Warning: Removed 21 rows containing non-finite values (stat_summary).

# Cluster composition with Raw data
library(gridExtra)
for(i in k.clus){
  name <- paste0("k_",i)
  base::assign(paste0("q",i), plot_cluster(clust_EGF_1$table, id.vars.col = 1:9, cluster.col = name, type="composition"))
}
grid.arrange(q2,q3,q4,q5,q6,q7,q8, ncol =4, nrow=2)

# With normalized smoothed data
for(i in k.clus){
  name <- paste0("k_",i)
  base::assign(paste0("q",i), plot_cluster(clust_EGF_3$table, id.vars.col = 1:9, cluster.col = name, type="composition"))
}
grid.arrange(q2,q3,q4,q5,q6,q7,q8, ncol =4, nrow=2)

Each cluster systematically contains the 4 concentrations, very low separability –> 1 type of response whatever the concentration.

2.3.2 NGF

clust_NGF_1 <- CastCluster(NGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.raw, plot = F, measure.col = "Ratio")
clust_NGF_2 <- CastCluster(NGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.nor, plot = F, measure.col = "Ratio.norm")
clust_NGF_3 <- CastCluster(NGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.nor, plot = F, measure.col = "Ratio.norm.smooth")

Advised number of clusters:

raw <- c(apply(clust_NGF_1$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_NGF_1$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
norm <- c(apply(clust_NGF_2$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_NGF_2$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
smooth <- c(apply(clust_NGF_3$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_NGF_3$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
rbind(raw, norm, smooth)
##        CH D SF Sil COP DB DBstar
## raw     2 7  2   2   8  2      2
## norm    2 8  2   2   8  2      2
## smooth  2 8  2   2   8  2      2

plot_cluster(clust_NGF_3$table, id.vars.col = 1:9, cluster.col = "k_2", type = "trajectory")
## Warning: Removed 41 rows containing non-finite values (stat_summary).

# Cluster composition with Raw data
library(gridExtra)
for(i in k.clus){
  name <- paste0("k_",i)
  base::assign(paste0("q",i), plot_cluster(clust_NGF_1$table, id.vars.col = 1:9, cluster.col = name, type="composition"))
}
grid.arrange(q2,q3,q4,q5,q6,q7,q8, ncol =4, nrow=2)

# With normalized smoothed data
for(i in k.clus){
  name <- paste0("k_",i)
  base::assign(paste0("q",i), plot_cluster(clust_NGF_3$table, id.vars.col = 1:9, cluster.col = name, type="composition"))
}
grid.arrange(q2,q3,q4,q5,q6,q7,q8, ncol =4, nrow=2)

Each cluster systematically contains the 4 concentrations, very low separability –> 1 type of response whatever the concentration.

2.3.3 FGF

clust_FGF_1 <- CastCluster(FGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.raw, plot = F, measure.col = "Ratio")
clust_FGF_2 <- CastCluster(FGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.nor, plot = F, measure.col = "Ratio.norm")
clust_FGF_3 <- CastCluster(FGF, time.col = tim, condition.col = cond, label.col = lab, k.clust = k.clus, na.fill = na.f.nor, plot = F, measure.col = "Ratio.norm.smooth")

Advised number of clusters:

raw <- c(apply(clust_FGF_1$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_FGF_1$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
norm <- c(apply(clust_FGF_2$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_FGF_2$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
smooth <- c(apply(clust_FGF_3$quality[c("CH","D","SF","Sil"),], 1, which.max) + 1, apply(clust_FGF_3$quality[c("COP","DB","DBstar"),], 1, which.min) + 1)
rbind(raw, norm, smooth)
##        CH D SF Sil COP DB DBstar
## raw     2 8  2   2   8  2      2
## norm    2 8  2   2   8  2      2
## smooth  2 8  2   2   8  2      2

plot_cluster(clust_FGF_3$table, id.vars.col = 1:9, cluster.col = "k_2", type = "trajectory")
## Warning: Removed 11 rows containing non-finite values (stat_summary).

# Cluster composition with Raw data
library(gridExtra)
for(i in k.clus){
  name <- paste0("k_",i)
  base::assign(paste0("q",i), plot_cluster(clust_FGF_1$table, id.vars.col = 1:9, cluster.col = name, type="composition"))
}
grid.arrange(q2,q3,q4,q5,q6,q7,q8, ncol =4, nrow=2)

# With normalized smoothed data
for(i in k.clus){
  name <- paste0("k_",i)
  base::assign(paste0("q",i), plot_cluster(clust_FGF_3$table, id.vars.col = 1:9, cluster.col = name, type="composition"))
}
grid.arrange(q2,q3,q4,q5,q6,q7,q8, ncol =4, nrow=2)

Separability seems better than for EGF and NGF, notably concentration 0.25 and 250 tend to be taken apart.

2.4 Choose number of clusters with the gap statistics

The gap statistic is a heuristic way of estimating the number of clusters. It compares the variance explained by our clustering to the expected explained variance if there were no clusters but just noise in that specific space region. It is very computationally costly but has the major advantage of also indicating whether clustering is even relevant in the first place (i.e. whether k=1 should be kept). See: https://datasciencelab.wordpress.com/2013/12/27/finding-the-k-in-k-means-clustering/

Prototype script:

modtsclust <- function(data, k, type, centroid, ...){
  # Interface function compatible with cluster::clusGap: runs tsclust and
  # returns only the cluster assignments, which is all clusGap reads.
  #
  # Args:
  #   data: numeric matrix, one time series per row.
  #   k: number of clusters (supplied by clusGap).
  #   type, centroid: forwarded to dtwclust::tsclust.
  #   ...: further tsclust arguments (e.g. distance, seed) — new,
  #     backward-compatible pass-through.
  #
  # NOTE(review): no seed is fixed here, so repeated clusGap runs are not
  # reproducible unless one is supplied via ... — confirm this is intended.
  library(dtwclust)
  temp <- tsclust(data, k = k, type = type, centroid = centroid, ...)
  return(list(cluster = temp@cluster))
}
# Maximum number of clusters evaluated by the gap statistic
kmax = 6L

require(cluster)
# EGF: cast normalized+smoothed trajectories to a wide matrix, fill NAs with
# the baseline fold-change value, then compute the gap statistic with B=50
# bootstrap reference sets
temp <- copy(EGF)
temp <- dcast(temp, Label ~ RealTime, value.var = "Ratio.norm.smooth")
temp <- as.matrix(temp[, -1]) # remove first columns with labels
temp[which(is.na(temp))] <- 1.1
cgap_EGF_smooth <- clusGap(temp, FUN=modtsclust, K.max=kmax, B=50, centroid = "pam", type = "partitional")

# NGF: raw Ratio values (unlike the EGF/FGF chunks, which use the smoothed
# fold change). The NA fill must therefore be on the raw scale (1200, as in
# na.f.raw): the previous fill of 1.1 was a copy-paste from the fold-change
# chunks and would have injected extreme outliers into the matrix.
temp <- copy(NGF)
temp <- dcast(temp, Label ~ RealTime, value.var = "Ratio")
temp <- as.matrix(temp[, -1]) # remove first columns with labels
temp[which(is.na(temp))] <- 1200
cgap_NGF <- clusGap(temp, FUN=modtsclust, K.max=kmax, B=100, centroid = "pam", type = "partitional")

# FGF: same procedure as EGF (normalized+smoothed, fold-change NA fill),
# with B=100 bootstrap reference sets
temp <- copy(FGF)
temp <- dcast(temp, Label ~ RealTime, value.var = "Ratio.norm.smooth")
temp <- as.matrix(temp[, -1]) # remove first columns with labels
temp[which(is.na(temp))] <- 1.1
cgap_FGF_smooth <- clusGap(temp, FUN=modtsclust, K.max=kmax, B=100, centroid = "pam", type = "partitional")


# Print the smallest k satisfying the gap criterion
# Gap(k) >= Gap(k+1) - SE(k+1); columns 3 and 4 of $Tab hold the gap value
# and its standard error. The break must sit INSIDE the if-block: previously
# it was outside, so the loop always stopped after the first iteration
# regardless of the test.
for(k in 1:(kmax-1)) {
    if(cgap_FGF_smooth$Tab[k,3]>cgap_FGF_smooth$Tab[(k+1),3]-cgap_FGF_smooth$Tab[(k+1),4]) {
        print(k)
        break
    }
}
# Load results
library(cluster)
temp <- "C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/"
temp <- paste0(temp, dir(temp))
sapply(temp, load, .GlobalEnv)
##   C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_EGF_norm.Robj 
##                                                                               "cgap_EGF_norm" 
##    C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_EGF_raw.Robj 
##                                                                                "cgap_EGF_raw" 
## C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_EGF_smooth.Robj 
##                                                                             "cgap_EGF_smooth" 
##   C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_FGF_norm.Robj 
##                                                                               "cgap_FGF_norm" 
##    C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_FGF_raw.Robj 
##                                                                                "cgap_FGF_raw" 
## C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_FGF_smooth.Robj 
##                                                                             "cgap_FGF_smooth" 
##   C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_NGF_norm.Robj 
##                                                                               "cgap_NGF_norm" 
##    C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_NGF_raw.Robj 
##                                                                                "cgap_NGF_raw" 
## C:/Users/pixel/Desktop/Lectures/Semester_3/Master_thesis/gap_stat_output/cgap_NGF_smooth.Robj 
##                                                                             "cgap_NGF_smooth"

2.5 Rerun clustering with shorter time series (centered on the peak)

EGF_short <- EGF[RealTime >= 25 & RealTime <= 100]
FGF_short <- FGF[RealTime >= 25 & RealTime <= 100]
NGF_short <- NGF[RealTime >= 25 & RealTime <= 100]

2.5.1 EGF

##        CH D SF Sil COP DB DBstar
## raw     2 8  2   2   8  2      2
## norm    2 6  2   2   8  2      2
## smooth  2 7  2   2   6  2      2
## Warning in melt.data.table(as.data.table(clust_EGF_1$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
## Warning in melt.data.table(as.data.table(clust_EGF_2$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
## Warning in melt.data.table(as.data.table(clust_EGF_3$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.

## Warning: Removed 6 rows containing non-finite values (stat_summary).

Does a much better job at separating the concentrations. Advantages of normalized data become clear.

2.5.2 NGF

##        CH D SF Sil COP DB DBstar
## raw     2 6  2   2   8  2      2
## norm    2 8  2   2   8  2      2
## smooth  2 8  2   2   8  2      2
## Warning in melt.data.table(as.data.table(clust_NGF_1$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
## Warning in melt.data.table(as.data.table(clust_NGF_2$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
## Warning in melt.data.table(as.data.table(clust_NGF_3$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.

## Warning: Removed 33 rows containing non-finite values (stat_summary).
## Warning: Removed 1 rows containing missing values (geom_path).

Still impossible to separate the concentrations properly.

2.5.3 FGF

##        CH D SF Sil COP DB DBstar
## raw     2 8  2   2   8  2      2
## norm    2 3  2   2   8  2      2
## smooth  2 8  2   2   8  2      2
## Warning in melt.data.table(as.data.table(clust_FGF_1$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
## Warning in melt.data.table(as.data.table(clust_FGF_2$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.
## Warning in melt.data.table(as.data.table(clust_FGF_3$quality)): To be
## consistent with reshape2's melt, id.vars and measure.vars are internally
## guessed when both are 'NULL'. All non-numeric/integer/logical type columns
## are conisdered id.vars, which in this case are columns []. Consider
## providing at least one of 'id' or 'measure' vars in future.

## Warning: Removed 6 rows containing non-finite values (stat_summary).
## Warning: Removed 1 rows containing missing values (geom_path).

Very clear separation is performed.

2.6 Visualization in PCA space

cast_EGF_short <- myCast(EGF_short, tim, cond, lab, "Ratio.norm.smooth", na.fill = 1.1)
# Replace the big outlier that is driving the 2nd component
cast_EGF_short$casted.matrix[which.max(cast_EGF_short$casted.matrix)] <- 1.1
pca_EGF_short <- prcomp(cast_EGF_short$casted.matrix, center = T)


library(ggbiplot)
## Loading required package: plyr
## Loading required package: scales
## Loading required package: grid
g <- ggbiplot(pca_EGF_short, obs.scale = 1, var.scale = 1, 
              groups = unlist(cast_EGF_short$casted[,1]), ellipse = TRUE, 
              circle = TRUE)
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal', 
               legend.position = 'top')
print(g)

cast_NGF_short <- myCast(NGF_short, tim, cond, lab, "Ratio.norm.smooth", na.fill = 1.1)
pca_NGF_short <- prcomp(cast_NGF_short$casted.matrix, center = T)


library(ggbiplot)
g <- ggbiplot(pca_NGF_short, obs.scale = 1, var.scale = 1, 
              groups = unlist(cast_NGF_short$casted[,1]), ellipse = TRUE, 
              circle = TRUE)
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal', 
               legend.position = 'top')
print(g)

cast_FGF_short <- myCast(FGF_short, tim, cond, lab, "Ratio.norm.smooth", na.fill = 1.1)
pca_FGF_short <- prcomp(cast_FGF_short$casted.matrix, center = T)


library(ggbiplot)
g <- ggbiplot(pca_FGF_short, obs.scale = 1, var.scale = 1, 
              groups = unlist(cast_FGF_short$casted[,1]), ellipse = TRUE, 
              circle = TRUE)
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal', 
               legend.position = 'top')
print(g)

The PCA plots show the same separability properties as clustering does. In addition the correlation circle confirms that data gets separated after the peak. Note that the 2nd component is much more important in FGF.

3 Separability of snapshots between concentrations of the same GF

Idea: For a given GF, at each time point and for each concentration get the signal distribution, check if they overlap well between different concentrations. Overlap can be measured by KS statistics, Bhattacharryya index, Kullback–Leibler…

However this implicitly assumes that there’s no subpopulation within a given condition (GF + concentration).

3.1 Raw values

sep.meas.along.time <- function(data1, data2, time.col, measure.col){
  # Compute separability measures between two conditions at every time point.
  #
  # Args:
  #   data1, data2: long-format data.tables of the two conditions; their
  #     time columns must contain identical time vectors.
  #   time.col: name of the time column (string).
  #   measure.col: name of the measured-value column (string).
  #
  # Returns: one row per time point, first column = time, remaining columns
  #   = the separability measures of the two value distributions.
  timev <- unique(data1[, get(time.col)])
  if(!(identical(unique(data2[, get(time.col)]), timev))) stop("Time vectors must be identical between the two data")
  # Compute per time point, then bind once. The previous version grew the
  # result with rbind inside the loop and hard-coded the RealTime column
  # there, silently ignoring the time.col argument for all but the first
  # time point.
  per.time <- lapply(timev, function(t){
    separability.measures(data1[get(time.col)==t, get(measure.col)],
                          data2[get(time.col)==t, get(measure.col)])
  })
  out <- do.call(rbind, per.time)
  out <- cbind(timev, out)
  return(out)
}

library(plyr)

# Get all pairs of conditions
conditions <- combn(as.character(unique(Yanni[,Condition])), m = 2)
conditions <- conditions[,c(1:3,12,13,22, 39:41,46,47,52, 61:66)]

# Compute separabilities of conditions at each time point
sep.meas.raw <- apply(conditions, 2, function(x) sep.meas.along.time(Yanni[Condition==x[1]],  Yanni[Condition==x[2]], "RealTime", "Ratio" ))
names(sep.meas.raw) <- apply(conditions, 2, function(x) paste(x[1], x[2], sep = ","))

# Go to data table
for(i in 1:length(sep.meas.raw)){
  temp <- unlist(strsplit(names(sep.meas.raw)[i], ","))
  sep.meas.raw[[i]]$Cond1 <- temp[1]
  sep.meas.raw[[i]]$Cond2 <- temp[2]
}

sep.meas.raw <- as.data.table(rbind.fill(sep.meas.raw))
sep.meas.raw[, c("Cond1", "Cond2") := list(as.factor(Cond1), as.factor(Cond2))]
ggplot(sep.meas.raw, aes(x= timev, y = jm)) + geom_line() + facet_wrap(~Cond1 + Cond2) + theme(text=element_text(size=25)) + ylab("Jeffries-Matusita [0, sqrt(2)]") + geom_vline(xintercept=40, colour="blue", linetype="longdash")

Can sum up these curves with area under curve:

max.val <- sqrt(2) * 101
auc.raw <- sep.meas.raw[, .(auc = sum(jm, na.rm = T)/max.val), by = c("Cond1", "Cond2")] # a few NAs, slight bias in the values
auc.raw
##      Cond1 Cond2         auc
##  1: E-0.25 E-2.5 0.086930074
##  2: E-0.25  E-25 0.132884272
##  3: E-0.25 E-250 0.183366617
##  4:  E-2.5  E-25 0.038354825
##  5:  E-2.5 E-250 0.060566879
##  6:   E-25 E-250 0.023626610
##  7: F-0.25 F-2.5 0.022514830
##  8: F-0.25  F-25 0.089739650
##  9: F-0.25 F-250 0.296925989
## 10:  F-2.5  F-25 0.060904139
## 11:  F-2.5 F-250 0.255272578
## 12:   F-25 F-250 0.093969100
## 13: N-0.25 N-2.5 0.031767259
## 14: N-0.25  N-25 0.039475775
## 15: N-0.25 N-250 0.033815373
## 16:  N-2.5  N-25 0.009405366
## 17:  N-2.5 N-250 0.008092665
## 18:   N-25 N-250 0.014956489

The indicator seems to show the differences in behaviour really well: NGF curves are completely flat, indicating that there are few deviations between 2 concentrations. As for FGF and EGF, about 2 clusters appear between behaviours at low versus behaviours at high concentrations.

The peaks observed around 40 min can either come from differences in excitation level or from a shift between series; this needs to be checked. If the former were true, we should perform a multiple alignment within each concentration prior to computing the distances.

3.2 Normalized values

sep.meas.norm <- apply(conditions, 2, function(x) sep.meas.along.time(Yanni[Condition==x[1]],  Yanni[Condition==x[2]], "RealTime", "Ratio.norm" ))
names(sep.meas.norm) <- apply(conditions, 2, function(x) paste(x[1], x[2], sep = ","))

# Go to data table
for(i in 1:length(sep.meas.norm)){
  temp <- unlist(strsplit(names(sep.meas.norm)[i], ","))
  sep.meas.norm[[i]]$Cond1 <- temp[1]
  sep.meas.norm[[i]]$Cond2 <- temp[2]
}

sep.meas.norm <- as.data.table(rbind.fill(sep.meas.norm))
sep.meas.norm[, c("Cond1", "Cond2") := list(as.factor(Cond1), as.factor(Cond2))]
ggplot(sep.meas.norm, aes(x= timev, y = jm)) + geom_line() + facet_wrap(~Cond1 + Cond2) + theme(text=element_text(size=25)) + ylab("Jeffries-Matusita [0, sqrt(2)]") + geom_vline(xintercept=40, colour="blue", linetype="longdash")

auc.norm <- sep.meas.norm[, .(auc = sum(jm, na.rm = T)/max.val), by = c("Cond1", "Cond2")] # a few NAs, slight bias in the values
auc.norm
##      Cond1 Cond2        auc
##  1: E-0.25 E-2.5 0.09572728
##  2: E-0.25  E-25 0.13078027
##  3: E-0.25 E-250 0.24245259
##  4:  E-2.5  E-25 0.05833530
##  5:  E-2.5 E-250 0.09841061
##  6:   E-25 E-250 0.05160538
##  7: F-0.25 F-2.5 0.02399241
##  8: F-0.25  F-25 0.14924847
##  9: F-0.25 F-250 0.30922374
## 10:  F-2.5  F-25 0.11881887
## 11:  F-2.5 F-250 0.29071881
## 12:   F-25 F-250 0.08043974
## 13: N-0.25 N-2.5 0.02954189
## 14: N-0.25  N-25 0.03388359
## 15: N-0.25 N-250 0.03452136
## 16:  N-2.5  N-25 0.01081681
## 17:  N-2.5 N-250 0.01714004
## 18:   N-25 N-250 0.01339676

It seems that the normalization does the metric no good, by amplifying small differences (due to shift)?

3.3 Normalized and smoothed

sep.meas.smooth <- apply(conditions, 2, function(x) sep.meas.along.time(Yanni[Condition==x[1]],  Yanni[Condition==x[2]], "RealTime", "Ratio.norm.smooth" ))
names(sep.meas.smooth) <- apply(conditions, 2, function(x) paste(x[1], x[2], sep = ","))

# Go to data table
for(i in 1:length(sep.meas.smooth)){
  temp <- unlist(strsplit(names(sep.meas.smooth)[i], ","))
  sep.meas.smooth[[i]]$Cond1 <- temp[1]
  sep.meas.smooth[[i]]$Cond2 <- temp[2]
}

sep.meas.smooth <- as.data.table(rbind.fill(sep.meas.smooth))
sep.meas.smooth[, c("Cond1", "Cond2") := list(as.factor(Cond1), as.factor(Cond2))]
ggplot(sep.meas.smooth, aes(x= timev, y = jm)) + geom_line() + facet_wrap(~Cond1 + Cond2) + theme(text=element_text(size=25)) + ylab("Jeffries-Matusita [0, sqrt(2)]") + geom_vline(xintercept=40, colour="blue", linetype="longdash")

auc.smooth <- sep.meas.smooth[, .(auc = sum(jm, na.rm = T)/max.val), by = c("Cond1", "Cond2")] # a few NAs, slight bias in the values
auc.smooth
##      Cond1 Cond2        auc
##  1: E-0.25 E-2.5 0.10127149
##  2: E-0.25  E-25 0.13575894
##  3: E-0.25 E-250 0.25643512
##  4:  E-2.5  E-25 0.05960717
##  5:  E-2.5 E-250 0.11217274
##  6:   E-25 E-250 0.05840472
##  7: F-0.25 F-2.5 0.06696335
##  8: F-0.25  F-25 0.14808925
##  9: F-0.25 F-250 0.31876715
## 10:  F-2.5  F-25 0.08271862
## 11:  F-2.5 F-250 0.18465569
## 12:   F-25 F-250 0.08339709
## 13: N-0.25 N-2.5 0.04005771
## 14: N-0.25  N-25 0.03978603
## 15: N-0.25 N-250 0.04588730
## 16:  N-2.5  N-25 0.01082399
## 17:  N-2.5 N-250 0.01855943
## 18:   N-25 N-250 0.01409727

As with normalization alone, it is probably better to stick to the raw data if possible.

3.4 To go further: bootstraps and permutation tests

3.4.1 Permutation tests

Tells us whether the observed area under the curve really reflects a difference between the two treatments or whether it is spurious. If there are n trajectories recorded in condition 1 and m in condition 2, the permutation test is performed like so:

  • Randomly assign n trajectories to group 1
  • Assign the rest to group 2
  • Compute the auc
  • Repeat

We then look if the observed auc is likely in this empirical distribution.

one.permutation.auc <- function(x, y, metric){
  # One draw from the permutation null: pool the trajectories of both
  # conditions, randomly reassign nrow(x) of them to group 1 and the rest
  # to group 2, then recompute the summed separability ("auc").
  # x, y: matrices, row = trajectory, col = time point (same n. of columns).
  # metric: row name to extract from separability.measures(), e.g. "jm".
  n <- nrow(x)
  temp <- rbind(x, y)
  samp.traj <- sample(seq_len(nrow(temp)), size = n, replace = FALSE)
  # drop = FALSE keeps the matrix shape even in degenerate cases
  x.resamp <- temp[samp.traj, , drop = FALSE]
  y.resamp <- temp[setdiff(seq_len(nrow(temp)), samp.traj), , drop = FALSE]

  # Separability at each time point, then summed over time.
  seps <- sapply(seq_len(ncol(x)), function(j) separability.measures(x.resamp[, j], y.resamp[, j]))
  return(sum(unlist(seps[metric, ])))
}

permutation.auc <- function(x, y, n, metric = "jm"){
  # Draw n values from the permutation null distribution of the summed
  # separability between the trajectories in x and those in y.
  # x,y: two matrices representing time series, row: trajectory; col: time
  # n: number of permutations
  # metric: one of "jm", "bh", "div", "tdiv", "ks"
  if(ncol(x) != ncol(y)) stop("x and y must have same number of columns")
  vapply(seq_len(n), function(i) one.permutation.auc(x, y, metric), numeric(1))
}
wrap_perm <- function(x, y, meas, n){
  # Cast the two long-format tables into trajectory-by-time matrices for
  # the measurement `meas`, then run n permutations on the result.
  cast.x <- myCast(x, "RealTime", "Condition", "Label", meas, 1100)
  cast.y <- myCast(y, "RealTime", "Condition", "Label", meas, 1100)
  permutation.auc(cast.x$casted.matrix, cast.y$casted.matrix, n)
}

# Permutation null distribution for the raw Ratio: 500 permutations per
# condition pair, scaled by the theoretical maximum, one histogram per pair.
temp <- apply(conditions, 2, function(x) wrap_perm(Yanni[Condition==x[1]], Yanni[Condition==x[2]], "Ratio", 500))
temp <- temp/max.val
colnames(temp) <- apply(conditions, 2, paste, collapse=" ; ")
par(mfrow=c(6,3))
for (j in seq_len(ncol(temp))) {
  hist(temp[,j], main = colnames(temp)[j], ylab="", xlab = "", cex.main = 3, cex.axis = 3)
}

3.4.2 Bootstraps

3.4.2.1 Bootstraps per column (i.e. resample time points)

In order to know whether these area under curves are robust estimates, could perform bootstraps the way it is performed in phylogenetics:

  • Initialize an empty matrix with dimensions of the one representing the time series along time.
  • From the initial matrix representing the trajectories, select a column at random (i.e. one time point)
  • Add this column to the empty matrix

Because we deal here with comparison of 2 populations we can modify this to:

  • Initialize two empty matrices, one for each population
  • Pick a random column index (or a time)
  • Extend both matrices with the column of its corresponding population.

This implies that matrices are of same length and defined at the same times.

one.bootstrap.auc.percol <- function(x, y, metric){
  # One bootstrap replicate over time points: draw column indices with
  # replacement and apply the SAME draw to both populations, so the two
  # resampled matrices stay defined at the same times.
  # x, y: matrices, row = trajectory, col = time point (same n. of columns).
  # metric: row name to extract from separability.measures(), e.g. "jm".
  samp.col <- sample(seq_len(ncol(x)), size = ncol(x), replace = TRUE)
  # drop = FALSE keeps the matrix shape when a single column is drawn
  x.resamp <- x[, samp.col, drop = FALSE]
  y.resamp <- y[, samp.col, drop = FALSE]
  seps <- sapply(seq_len(ncol(x)), function(j) separability.measures(x.resamp[, j], y.resamp[, j]))
  return(sum(unlist(seps[metric, ])))
}

bootstrap.auc.percol <- function(x, y, B, metric = "jm"){
  # B bootstrap replicates of the summed separability, resampling time
  # points (columns) with replacement.
  # x,y: two matrices representing time series, row: trajectory; col: time
  # B: number of bootstraps
  # metric: one of "jm", "bh", "div", "tdiv", "ks"
  if(ncol(x) != ncol(y)) stop("x and y must have same number of columns")
  vapply(seq_len(B), function(b) one.bootstrap.auc.percol(x, y, metric), numeric(1))
}
wrap_bootcol <- function(x, y, meas, n){
  # Cast the two long-format tables into trajectory-by-time matrices for
  # the measurement `meas`, then run n per-column bootstraps on the result.
  cast.x <- myCast(x, "RealTime", "Condition", "Label", meas, 1100)
  cast.y <- myCast(y, "RealTime", "Condition", "Label", meas, 1100)
  bootstrap.auc.percol(cast.x$casted.matrix, cast.y$casted.matrix, n)
}

# Per-column (time-point) bootstrap distribution for the raw Ratio: 500
# replicates per condition pair, scaled by the theoretical maximum.
bootcol <- apply(conditions, 2, function(x) wrap_bootcol(Yanni[Condition==x[1]], Yanni[Condition==x[2]], "Ratio", 500))
bootcol <- bootcol/max.val
colnames(bootcol) <- apply(conditions, 2, paste, collapse=" ; ")
par(mfrow=c(6,3))
for (j in seq_len(ncol(bootcol))) {
  hist(bootcol[,j], main = colnames(bootcol)[j], ylab="", xlab = "", cex.main = 3, cex.axis = 3)
}

We see that the average of the distribution corresponds to the observed value.

3.4.2.2 Bootstraps per row (i.e. resample trajectories)

A lack of robustness here could potentially reveal the presence of subpopulations, which are missed when the values are simply summed up into an AUC.

one.bootstrap.auc.perrow <- function(x, y, metric){
  # One bootstrap replicate over trajectories: resample the rows of each
  # population independently, with replacement, then recompute the summed
  # separability ("auc").
  # x, y: matrices, row = trajectory, col = time point (same n. of columns).
  # metric: row name to extract from separability.measures(), e.g. "jm".
  samp.rowx <- sample(seq_len(nrow(x)), size = nrow(x), replace = TRUE)
  samp.rowy <- sample(seq_len(nrow(y)), size = nrow(y), replace = TRUE)
  # drop = FALSE keeps the matrix shape even for a single-trajectory input
  x.resamp <- x[samp.rowx, , drop = FALSE]
  y.resamp <- y[samp.rowy, , drop = FALSE]
  seps <- sapply(seq_len(ncol(x)), function(j) separability.measures(x.resamp[, j], y.resamp[, j]))
  return(sum(unlist(seps[metric, ])))
}

bootstrap.auc.perrow <- function(x, y, B, metric = "jm"){
  # B bootstrap replicates of the summed separability, resampling
  # trajectories (rows) with replacement in each population.
  # x,y: two matrices representing time series, row: trajectory; col: time
  # B: number of bootstraps
  # metric: one of "jm", "bh", "div", "tdiv", "ks"
  if(ncol(x) != ncol(y)) stop("x and y must have same number of columns")
  vapply(seq_len(B), function(b) one.bootstrap.auc.perrow(x, y, metric), numeric(1))
}
wrap_bootrow <- function(x, y, meas, n){
  # Cast the two long-format tables into trajectory-by-time matrices for
  # the measurement `meas`, then run n per-row bootstraps on the result.
  cast.x <- myCast(x, "RealTime", "Condition", "Label", meas, 1100)
  cast.y <- myCast(y, "RealTime", "Condition", "Label", meas, 1100)
  bootstrap.auc.perrow(cast.x$casted.matrix, cast.y$casted.matrix, n)
}

# Per-row (trajectory) bootstrap distribution for the raw Ratio: 500
# replicates per condition pair, scaled by the theoretical maximum.
bootrow <- apply(conditions, 2, function(x) wrap_bootrow(Yanni[Condition==x[1]], Yanni[Condition==x[2]], "Ratio", 500))
bootrow <- bootrow/max.val
colnames(bootrow) <- apply(conditions, 2, paste, collapse=" ; ")
par(mfrow=c(6,3))
for (j in seq_len(ncol(bootrow))) {
  hist(bootrow[,j], main = colnames(bootrow)[j], ylab="", xlab = "", cex.main = 3, cex.axis = 3)
}

3.4.2.3 Confidence Interval from bootstraps

# Get 95% CI directly from bootstrap quantiles (percentile method),
# one interval per condition pair (columns of bootcol/bootrow).
ci.bootcol <- apply(bootcol, 2, quantile, probs = c(0.025, 0.975))
ci.bootrow <- apply(bootrow, 2, quantile, probs = c(0.025, 0.975))
# Observed AUCs as points, per-column bootstrap CIs as vertical segments
plot(seq_along(auc.raw), auc.raw, type = "p", ylim = c(0,0.4), main = "Bootcol intervals")
segments(x0=seq_along(auc.raw), y0=ci.bootcol[1,], y1=ci.bootcol[2,])
# Dashed lines separate the three growth factors (6 pairs each: EGF/FGF/NGF)
abline(v=c(6.5,12.5), lty="dashed")
# Same, with the per-row (trajectory-resampling) bootstrap CIs
plot(seq_along(auc.raw), auc.raw, type = "p", ylim = c(0,0.4), main = "Bootrow intervals")
segments(x0=seq_along(auc.raw), y0=ci.bootrow[1,], y1=ci.bootrow[2,])
abline(v=c(6.5,12.5), lty="dashed")

4 Cross-Correlations/Coherence between concentrations of the same GF

Idea: For each GF, compare each pair of trajectories that do not belong to the same concentration. This comparison can be done by cross-correlation, overlap of clipped trajectories, simple correlations… Cross-correlation provides a way to align the signals as well as we can by shifting them; this is useful to get rid of differences that would come from experimental conditions, e.g. having waited 5 more minutes before injecting the GF.

Could also check coherence (cross-correlation based on spectral densities).

5 Information theory approach: entropy of heatmap / signal

Idea: Quantize the signals and check how complex they are. Pool all concentrations of a single GF together and see how complex the resulting image is. We can quantize the signals and compute the entropy, or do it directly from a heatmap (quantization is then performed by color coding).

# Read the previously generated RGB heatmap image
library(png)
# NOTE(review): hard-coded absolute Windows path — consider a relative path
heatm <- readPNG("C:/Users/pixel/Dropbox/Marc-Antoine/data/set2_Yannick_EGFNGFFGFsust_PC12/heatmap.png")
# Go to greyscale with a simple average of channels